## Warning: package 'caret' was built under R version 3.5.2
## Warning: package 'doMC' was built under R version 3.5.2
Data from 3 surveys:
All have a range of (overlapping) demographic variables AND ask about support in the 2018 midterms.
# import data
data_sept18 = data.table(read.spss('data/Sept18/Sept18 public.sav', to.data.frame = T), stringsAsFactors = F)
data_june18 = data.table(read.spss('data/June18/June18 public.sav', to.data.frame = T), stringsAsFactors = F)
data_may18 = data.table(read.spss('data/May18/May18 public.sav', to.data.frame = T), stringsAsFactors = F)
data_sept18
data_may18[, .N, party]
The idea of this approach is to find a way to leverage the extensive/valuable data on the voterfile to outcomes observed in a survey (where more limited covariates are collected). Therefore, we need to assign variables to where we’re saying they’re observed (in the survey, in the voterfile or in both).
# specify where each var is observed (survey, file or both)
survey_vars = c('demo_mode', 'demo_education', 'demo_phonetype', 'month_called', 'demo_ideology', 'demo_party')
file_and_survey_vars = c('demo_sex', 'demo_age_bucket', 'demo_state', 'demo_income', 'demo_region', 'demo_race', 'demo_hispanic')
# RECODE all data sets
data_recoded = rbindlist(lapply(list(data_sept18, data_may18, data_june18), doPewRecode))
data_recoded
Create a data table of all of the covars we have to work with
# create data table with vars and levels
covars = names(data_recoded)[grepl('demo', names(data_recoded))]
covars = data.table(do.call(rbind, lapply(covars, function(c){
cbind(c, data_recoded[, .(level = unique(get(c)))][order(level)])
})))
setnames(covars, c('var', 'level'))
covars[, level_modmat := paste0(var, level)]
covars[, in_survey := as.numeric(var %in% survey_vars)]
covars[, in_both := as.numeric(var %in% file_and_survey_vars)]
covars[, in_file := as.numeric(in_survey + in_both == 0)]
covars[, .N, .(in_survey, in_file, in_both)]
We need to divide the data into the following partitions:
We also want to mirror real conditions, in that the survey data is not selected completely at random from the voterfile - it should be biased. The same is true for the survey data that we can match to the voterfile. Therefore, we specify a “surveyed mechanism” and a “matched mechanism” based on my knowledge of common patterns observed in real situations.
Generally, those who respond to surveys tend to be older, unemployed/retired/disabled, partisan, more educated, have white collar jobs and reached on cell phones. We are more likely to be able to match someone to the voterfile if they are higher income (more consumer data available on them), reached on a cell-phone, older, white, live in a smaller household. These trends are reflected in the formula for \(p_\text{missing}\) and \(p_{matched}\) specified below.
# scale age and set NAs to 0
data_recoded[, age_scaled := scale(age_num)]
data_recoded[is.na(age_scaled), age_scaled := 0]
data_recoded[, p_surveyed :=
(-2)
+ 2 * age_scaled
- 0.5 * is.na(age_num)
+ 1.5 * as.numeric(demo_mode == 'cell')
- 1.5 * as.numeric(demo_party == '05-Ind')
- 3 * as.numeric(demo_party == "99-DK/refused")
+ 1.5 * as.numeric(demo_education %in% c('01-postgrad', '02-bach'))
+ 3 * as.numeric(demo_ideology == 'Very conservative' | demo_ideology == 'Very liberal')
]
data_recoded[, p_surveyed := exp(p_surveyed)/(1 + exp(p_surveyed))]
hist(data_recoded[, p_surveyed])
data_recoded[, .(.N, mean(p_surveyed)), .(demo_age_bucket)][order(demo_age_bucket)]
data_recoded[, .(.N, mean(p_surveyed)), .(demo_mode)][order(demo_mode)]
data_recoded[, .(.N, mean(p_surveyed)), .(demo_party)][order(demo_party)]
data_recoded[, .(.N, mean(p_surveyed)), .(demo_ideology)][order(demo_ideology)]
data_recoded[, p_matched := NULL]
## Warning in `[.data.table`(data_recoded, , `:=`(p_matched, NULL)): Adding
## new column 'p_matched' then assigning NULL (deleting it).
data_recoded[, p_matched :=
-2 +
-2 * as.numeric(demo_mode == 'landline')
+ 3 * as.numeric(demo_race == 'W')
+ -2 * as.numeric(demo_reg == '03-No')
+ -1 * as.numeric(demo_hhsize == 2)
+ -2 * as.numeric(demo_hhsize == 3)
+ 2 *age_scaled
+ as.numeric(demo_income)/3
- 4* as.numeric(demo_income == '99-DK/refused')
]
data_recoded[, p_matched := exp(p_matched)/(1 + exp(p_matched))]
hist(data_recoded$p_matched)
data_recoded[, .(.N, mean(p_matched)), demo_mode]
data_recoded[, .(.N, mean(p_matched)), demo_hispanic]
data_recoded[, .(.N, mean(p_matched)), demo_age_bucket][order(demo_age_bucket)]
Check correlation between the generated probabilities
ggplot(data_recoded, aes(x = p_surveyed, y = p_matched)) + geom_point()
Generate actual partitions based on \(p_\text{surveyed}\) and \(p_\text{matched}\). The holdout (test) set is selected first using \(p=1/n\) for all \(n\) units in the Pew data. From the remaining data, we first select the set of observed survey data proportional to \(p_\text{surveyed}\). Last, from the surveyed data, we select the subset of data that matches to the voterfile with probability \(p_\text{matched}\).
testtrain = getTestTrain(data = data_recoded
, n_holdout = 1000, n_surveyed = 2000, n_matched = 1000
, p_surveyed = data_recoded$p_surveyed
, p_matched = data_recoded$p_matched
)
## Warning in `[.data.table`(data, , `:=`(holdout, NULL)): Adding new column
## 'holdout' then assigning NULL (deleting it).
## Warning in `[.data.table`(data, , `:=`(surveyed, NULL)): Adding new column
## 'surveyed' then assigning NULL (deleting it).
## Warning in `[.data.table`(data, , `:=`(matched, NULL)): Adding new column
## 'matched' then assigning NULL (deleting it).
data_recoded = testtrain$data
# N and pct voting dem by partition
data_recoded[, .(.N, mean(y_dem)), list(holdout, surveyed, matched, voterfile)]
# check prop surveyed and matched by demos
data_recoded[, .(.N, prop_surveyed = mean(surveyed), prop_matched = sum(matched)/sum(surveyed), overall_matched = mean(matched)), demo_age_bucket][order(demo_age_bucket)]
data_recoded[, .(.N, prop_surveyed = mean(surveyed), prop_matched = sum(matched)/sum(surveyed), overall_matched = mean(matched)), demo_party][order(demo_party)]
First, we’ll walk through each step in distribution regression, fixing hyperparameters:
These steps are implemented below.
#### Create bags of survey responses using variables observed in BOTH survey AND voterfile
# should we make the bags with the subset of data from the survey, or the whole file???
# sigma = 0.003 ## chosen with median heuristic
n_bags = 50
n_landmarks = 12
bags = getBags(data = data_recoded[surveyed == 1,]
, vars = file_and_survey_vars
, n_bags = n_bags
, newdata = data_recoded[voterfile == 1 | holdout == 1, ])
table(bags$bags)
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
## 30 29 45 47 63 35 28 66 40 36 25 63 29 58 26 30 52 57 57 40 46 46 17 73 47
## 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50
## 24 66 32 18 28 44 35 23 85 41 24 51 45 37 61 31 33 42 14 15 47 31 13 27 48
length(unique(bags$bags))
## [1] 50
data_recoded[surveyed == 1, bag := bags$bags]
data_recoded[voterfile == 1 | holdout == 1, bag := bags$bags_newdata]
data_recoded[, .(.N, mean(surveyed)), .(bag)][order(bag)]
landmarks = getLandmarks(data = data_recoded
, vars = unique(covars[in_both == 1 | in_file == 1,]$var)
, n_landmarks = n_landmarks
, subset_ind = (data_recoded$voterfile == 1))
X_file = landmarks$X[data_recoded$voterfile == 1, ]
Choose scale parameter \(\sigma\) for RBF kernel - use median heuristic for now
rbf1 = rbfdot(sigma = 1)
K_sigma = kernelMatrix(rbf1, x = as.matrix(X_file), y = landmarks$landmarks)
sigma = median(K_sigma)
train_bag = data_recoded[voterfile == 1, bag]
test_bag = data_recoded[holdout == 1, bag]
# get features
features = getFeatures(data = landmarks$X
, bag = data_recoded$bag
, train_ind = data_recoded$voterfile # the col of indicators for the training set
, landmarks = landmarks$landmarks
, sigma = sigma)
# calculate dependent var in each bag
Y_svy_bag = data_recoded[surveyed == 1, .(y_mean = mean(y_dem)), bag][order(bag)]
# make sure Y has all levels
Y_svy_bag = merge(data.table(bag = 1:n_bags), Y_svy_bag, all.x = T)
Y_svy_bag[is.na(y_mean), y_mean := 0]
# do basic DR
fit_basicDR = fitLasso(mu_hat = features$mu_hat
, Y_bag = Y_svy_bag$y_mean
, phi_x = features$phi_x
)
# score the file
data_recoded[, y_dem_basicdr := fit_basicDR$Y]
calcMSE(Y = data_recoded$y_dem, Y_pred = data_recoded$y_dem_basicdr)
## [1] 0.2713995
data_recoded[voterfile == 1 | holdout == 1, y_dem_basicdr_dec := cut(y_dem_basicdr, breaks = quantile(y_dem_basicdr, probs = seq(0,1,0.1)), labels = 1:10, include.lowest = T)]
ggplot(data_recoded[voterfile == 1, .(pct_y_dem = mean(y_dem)), by = y_dem_basicdr_dec], aes(x = y_dem_basicdr_dec, y = pct_y_dem)) +
geom_bar(stat = 'identity') +
ggtitle("Pct Dem supporter by score decile - voterfile")
ggplot(data_recoded[holdout == 1, .(pct_y_dem = mean(y_dem)), by = y_dem_basicdr_dec], aes(x = y_dem_basicdr_dec, y = pct_y_dem)) +
geom_bar(stat = 'identity') +
ggtitle("Pct Dem supporter by score decile - holdout")
Use hyperband algorithm described here - https://medium.com/criteo-labs/hyper-parameter-optimization-algorithms-2fe447525903
Main advantage of this method are that it runs multiple iterations with the same parameters (there’s still an element of randomness in how the initital centroids are chosen in each k-means run, so this helps with evaluating mean performance of each set of hyperparams).
Disadvantage is that we only consider 64 randomly-selected sets of hyperparameters, which may not fully explore the space enough. Though the hyperparameters chosen do seem to vastly improve performance over initial (non-tuned) choices.
library(parallel)
library(MASS)
# takes a while, so don't run by accident/automatically
if(FALSE){
# https://medium.com/criteo-labs/hyper-parameter-optimization-algorithms-2fe447525903
n_param_sets = 64
results = data.table(sigma = exp(runif(min = -15, max = -1, n = n_param_sets))
, n_landmarks = round(runif(min = 10, max = 400, n = n_param_sets))
, n_bags = round(exp(runif(min = 2, max = 5, n = n_param_sets)))
, round = rep(0, n_param_sets)
, mse = rep(0, n_param_sets)
, mse_count = rep(0, n_param_sets)
)
#setnames(results, c('sigma','n_landmarks', 'n_bags', 'mse'))
#initialize counters
which_left = 1:n_param_sets
n_iter = 10
while(length(which_left) > 1){
cat(paste(Sys.time(), "round: ", max(results[, round] + 1)), ', remaining: ',length(which_left),'\n')
# double the number of iterations
#if(this_round > 3) n_iter = 50
for(p in which_left){
#cat(paste('\t param:', p, '\n'))
# increment round
results[p, round := round + 1]
mse_sum = unlist(mclapply(1:n_iter, function(i){
tryCatch(doBasicDR(data = data_recoded
, bagging_vars = file_and_survey_vars
, regression_vars = unique(covars[in_both == 1 | in_file == 1,]$var)
#, outcome = 'y_dem'
, outcome = c('y_dem', 'y_rep', 'y_oth')
, family = 'multinomial'
, n_bags = results[p, n_bags]
, n_landmarks = results[p, n_landmarks]
, sigma = results[p, sigma]
, bagging_ind = 'surveyed'
, train_ind = 'voterfile'
, test_ind = 'holdout'
)$mse_test
, error = function(e) {
print(e)
return(0)})
}, mc.cores = 5 #need this for parallelization
))
results[p, mse := mse + sum(mse_sum)]
results[p, mse_count := mse_count + sum(mse_sum > 0)]
}
which_left = which(results[which_left, mse/mse_count < median(mse/mse_count, na.rm = T)])
}
# best
results[round == max(round)][order(mse)]
}
bagging_vars = file_and_survey_vars
regression_vars = unique(covars[in_both == 1 | in_file == 1,]$var)
# params from per_hyperparam_opt.R
fit_basicDR = doBasicDR(data = data_recoded
, bagging_vars = file_and_survey_vars
, regression_vars = regression_vars
, outcome = 'y_dem'
, n_bags = 95
, n_landmarks = 45
, sigma = 0.004
, bagging_ind = 'surveyed'
, train_ind = 'voterfile'
, test_ind = 'holdout')
## 2019-08-01 10:18:53 Making bags
## 2019-08-01 10:18:53 Getting landmarks
## 2019-08-01 10:18:54 Making features
## Warning in doBasicDR(data = data_recoded, bagging_vars = file_and_survey_vars, : not all bags in voterfile
## 2019-08-01 10:18:54 Fitting model
# get score deciles
fit_basicDR$data[voterfile == 1 | holdout == 1, y_hat_dec := cut(y_dem_hat, breaks = quantile(y_dem_hat, probs = seq(0,1,0.1)), labels = 1:10, include.lowest = T)]
# plot voterfile data
ggplot(fit_basicDR$data[voterfile == 1, .(pct_y_hat = mean(y_dem_hat)), by = y_hat_dec], aes(x = y_hat_dec, y = pct_y_hat)) +
geom_bar(stat = 'identity') +
ggtitle("Pct Dem supporter by score decile - voterfile")
# plot holdout data
ggplot(fit_basicDR$data[holdout == 1, .(pct_y_hat = mean(y_dem_hat)), by = y_hat_dec], aes(x = y_hat_dec, y = pct_y_hat)) +
geom_bar(stat = 'identity') +
ggtitle("Pct Dem supporter by score decile - holdout")
As a baseline, we’ll fit a simple LASSO using only the \(n_matched\) observations that were matched to the file. We fit the LASSO once to find which coefs are non-zero, and then re-fit with only those covars and no penalty to avoid shrinkage.
The other baseline we’ll use is the mean in each bag observed in the survey data (matched and unmatched). This is our dependent variable for the distribution regression.
Fit, predict, and plot results
X_matched = landmarks$X[data_recoded$matched == 1, ]
lasso_fit = cv.glmnet(x = X_matched
, y = data_recoded[matched == 1, ]$y_dem
, nfolds = 10
, family = 'binomial')
nonzero_ind = which(coef(lasso_fit, s = 'lambda.min')[-1] != 0)
# re-fit to avoid shrinkage
lasso_fit = glmnet(x = X_matched[, nonzero_ind]
, y = data_recoded[matched == 1, ]$y_dem
, lambda = 0
, family = 'binomial')
# predict on full dataset
data_recoded[, y_hat_lasso := predict(lasso_fit, newx = landmarks$X[, nonzero_ind], type = 'response')]
# get score deciles
data_recoded[holdout == 1, y_hat_lasso_dec := cut(y_hat_lasso, breaks = quantile(y_hat_lasso, probs = seq(0,1,0.1)), labels = 1:10, include.lowest = T)]
# plot avg outcome by score decile
ggplot(data_recoded[holdout == 1, .(pct_y_hat = mean(y_hat_lasso)), by = y_hat_lasso_dec], aes(x = y_hat_lasso_dec, y = pct_y_hat)) +
geom_bar(stat = 'identity') +
ggtitle("Pct Dem supporter by score decile - holdout")
Merge bag average with full file
data_recoded[Y_svy_bag, on = 'bag', y_bagmean := i.y_mean]
Calculate the MSE of each the basic DR and the LASSO
data.frame(method = c("Dist Reg", "Logit", "Bag mean")
, MSE = c(calcMSE(Y = data_recoded[holdout == 1, y_dem], data_recoded[holdout == 1, y_dem_hat])
, calcMSE(Y = data_recoded[holdout == 1, y_dem], data_recoded[holdout == 1, y_hat_lasso])
, calcMSE(Y = data_recoded[holdout == 1, y_dem], data_recoded[holdout == 1, y_bagmean])))
See how well each method is predicting the overall topline % dem support (bias)
data_recoded[, .(actual = mean(y_dem)
, basicdr = mean(y_dem_hat)
, logit = mean(y_hat_lasso)
, groupavg = mean(y_bagmean, na.rm =T))
, by = .(partition = ifelse(holdout == 1, 'holdout', ifelse(voterfile == 1, 'voterfile', ifelse(matched == 1, 'survey-matched', 'survey-unmatched'))))]
Now we’ll switch to a multinomial outcome \(\mathbf{y} = (y_\text{dem}, y_\text{rep}, y_\text{other})\)
outcome = c('y_dem', 'y_rep', 'y_oth')
# params from per_hyperparam_opt.R
fit_basicDR = doBasicDR(data = data_recoded
, bagging_vars = file_and_survey_vars
, regression_vars = unique(covars[in_both == 1 | in_file == 1,]$var)
, outcome = outcome
, n_bags = 131
, n_landmarks = 32
, sigma = 0.006
, family = 'multinomial'
, bagging_ind = 'surveyed'
, train_ind = 'voterfile'
, test_ind = 'holdout')
## 2019-08-01 10:18:56 Making bags
## 2019-08-01 10:18:56 Getting landmarks
## 2019-08-01 10:18:56 Making features
## Warning in doBasicDR(data = data_recoded, bagging_vars = file_and_survey_vars, : not all bags in voterfile
## 2019-08-01 10:18:56 Fitting model
fit_basicDR$mse_test
## [1] 0.1752895
Multinomial group means for comparison
Y_grp_means = fit_basicDR$data[surveyed == 1, lapply(.SD, mean), .SDcols = outcome, by = bag]
setnames(Y_grp_means, c('bag',paste0(outcome, '_grpmean')))
data_recoded = merge(data_recoded, Y_grp_means, by = 'bag', all.x = T)
X_matched = landmarks$X[data_recoded$matched == 1, ]
logit_multinom = cv.glmnet(x = X_matched
, y = as.matrix(data_recoded[matched == 1, which(names(data_recoded) %in% outcome), with = F])
#, nfolds = 10
, family = 'multinomial'
#, alpha = 0.2
, parallel = T
)
nonzero_ind = sort(unique(unlist(lapply(coef(logit_multinom, s = 'lambda.min'), function(c){
which(c[-1] != 0)
}))))
cat(nonzero_ind)
## 17 23 53 55 72
if(length(nonzero_ind) == 0){
nonzero_ind = 1
nonzero_ind = 1:10
}
# # re-fit to avoid shrinkage
logit_multinom = glmnet(x = X_matched[, nonzero_ind]
, y = as.matrix(data_recoded[matched == 1, which(names(data_recoded) %in% outcome), with = F])
, lambda = 0
, family = 'multinomial')
# predict on full dataset
data_recoded[, c('y_logit_dem', 'y_logit_rep', 'y_logit_oth') := as.list(data.table(matrix(predict(logit_multinom, newx = landmarks$X[, nonzero_ind], type = 'response'), ncol= 3)))]
head(data_recoded[, c('y_logit_dem', 'y_logit_rep', 'y_logit_oth')])
# compare MSE
data.frame(
method = c('Dist Reg', 'Logit','Bag avg')
, MSE = c(calcMSE(Y = data_recoded[holdout == 1, get(outcome)], data_recoded[holdout == 1, get(paste0(outcome, '_hat'))])
, calcMSE(Y = data_recoded[holdout == 1, get(outcome)], data_recoded[holdout == 1, get(c('y_logit_dem', 'y_logit_rep', 'y_logit_oth'))])
, calcMSE(Y = data_recoded[holdout == 1, get(outcome)], data_recoded[holdout == 1, get(c('y_dem_grpmean', 'y_rep_grpmean', 'y_oth_grpmean'))]))
)